In [ ]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import when
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import approx_count_distinct
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
Criando Sessão e Lendo CSV¶
In [ ]:
# Create (or reuse) the Spark session and load the customer dataset.
spark = (
    SparkSession.builder
    .appName("Session Atividade 2")
    .getOrCreate()
)

# Semicolon-separated CSV with a header row; let Spark infer the column types.
customer = spark.read.csv(
    "/user/customer.csv",
    inferSchema=True,
    header=True,
    sep=";",
)
Prévia do CSV¶
In [ ]:
# Preview the first 20 rows of the raw data.
customer.show()
+-----------+--------+---------+-----+---------+---+--------------------+-------------+---+------------+------+---------------+-------------+---------------+----------------+--------------+--------------+----------+---------------+----------------------+-----------------------+----------------------+---------------+-----------+-----------+--------------+------------------+-------------+----------------+--------+---------+ |CUSTOMER_ID| LAST| FIRST|STATE| REGION|SEX| PROFESSION|BUY_INSURANCE|AGE|HAS_CHILDREN|SALARY|N_OF_DEPENDENTS|CAR_OWNERSHIP|HOUSE_OWNERSHIP|TIME_AS_CUSTOMER|MARITAL_STATUS|CREDIT_BALANCE|BANK_FUNDS|CHECKING_AMOUNT|MONEY_MONTLY_OVERDRAWN|T_AMOUNT_AUTOM_PAYMENTS|MONTHLY_CHECKS_WRITTEN|MORTGAGE_AMOUNT|N_TRANS_ATM|N_MORTGAGES|N_TRANS_TELLER|CREDIT_CARD_LIMITS|N_TRANS_KIOSK|N_TRANS_WEB_BANK| LTV| LTV_BIN| +-----------+--------+---------+-----+---------+---+--------------------+-------------+---+------------+------+---------------+-------------+---------------+----------------+--------------+--------------+----------+---------------+----------------------+-----------------------+----------------------+---------------+-----------+-----------+--------------+------------------+-------------+----------------+--------+---------+ | CU8589| KALA| IVERSON| WI| Midwest| F| PROF-9| No| 49| 1| 68696| 1| 1| 1| 1| WIDOWED| 0| 16100| 25| 53,14| 1749| 4| 5500| 2| 1| 5| 800| 1| 3700| 25574| HIGH| | CU9823| NESTOR| HORTON| CA| West| M| Nurse| No| 24| 0| 73850| 0| 1| 0| 2| SINGLE| 0| 0| 25| 53,06| 504| 0| 0| 0| 0| 0| 1500| 1| 0| 21862,5| MEDIUM| | CU12175| DWIGHT| ROLLINS| MI| Midwest| M|Programmer/Developer| No| 26| 1| 60249| 2| 1| 1| 2| MARRIED| 452| 500| 134| 53,42| 625| 4| 1036| 4| 1| 2| 1000| 4| 1036|19662,25| MEDIUM| | CU7924| GRADY| STATON| NY|NorthEast| M|Programmer/Developer| No| 32| 0| 60466| 1| 1| 0| 1| SINGLE| 0| 650| 265| 53,18| 278| 17| 0| 3| 0| 2| 700| 3| 0| 16816,5| MEDIUM| | CU14284|DOMINICK| LAW| NY|NorthEast| M|Construction Laborer| No| 24| 0| 76570| 
3| 1| 1| 3| SINGLE| 0| 0| 25| 53,06| 0| 1| 358| 0| 1| 0| 1500| 1| 358| 27042,5| HIGH| | CU14620| DANIAL| CORLEY| UT|Southwest| M| Truck Driver| No| 35| 1| 62756| 1| 1| 1| 1| MARRIED| 0| 501| 1382| 53,32| 1636| 8| 1500| 3| 1| 2| 500| 1| 1500| 22689| HIGH| | CU15186| VERNON| NIETO| NY|NorthEast| F| IT Staff| No| 36| 1| 62886| 1| 1| 1| 1| MARRIED| 0| 600| 25| 53,22| 757| 2| 1020| 3| 1| 2| 1000| 1| 1020| 22821,5| HIGH| | CU5165| KITTY| COFFEY| MI| Midwest| F| Truck Driver| No| 26| 1| 61012| 1| 1| 1| 1| MARRIED| 0| 2400| 1314| 53,27| 2299| 3| 1300| 3| 1| 1| 1100| 0| 1300| 21353| MEDIUM| | CU5938| TEDDY| MURILLO| CA| West| M| Author| No| 78| 1| 65134| 0| 1| 0| 2| SINGLE| 0| 0| 25| 53,08| 514| 4| 0| 1| 0| 0| 900| 0| 0| 20083,5| MEDIUM| | CU3296| MARYLEE| HATHAWAY| CA| West| F| Not specified| No| 49| 1| 60322| 3| 1| 1| 4| WIDOWED| 30650| 9550| 266| 53,73| 33850| 12| 6500| 4| 1| 4| 900| 2| 1700| 20980,5| MEDIUM| | CU1069| THURMAN| VAUGHAN| MN| West| M| PROF-15| No| 80| 1| 63067| 0| 1| 0| 2| SINGLE| 0| 0| 25| 53,06| 504| 0| 0| 0| 0| 1| 1400| 0| 0|19766,75| MEDIUM| | CU13543| DARYL| ADDISON| WA| West| F| PROF-39| No| 52| 1| 73090| 5| 1| 2| 5| DIVORCED| 0| 1750| 25| 53,33| 249| 10| 2728| 3| 2| 3| 1500| 1| 2728| 30972,5|VERY HIGH| | CU11717| LAVETA| CASEY| NY|NorthEast| F| Publisher| No| 31| 0| 71782| 5| 1| 1| 5| DIVORCED| 0| 0| 34| 53,02| 0| 0| 2200| 2| 1| 1| 1500| 1| 2200| 23545,5| HIGH| | CU5928| ROBERTO|CORNELIUS| NY|NorthEast| M| Childcare Worker| No| 44| 1| 61056| 1| 1| 1| 1| DIVORCED| 0| 0| 461| 53,15| 965| 0| 2200| 2| 1| 1| 1000| 0| 2200| 23164| HIGH| | CU10012| DORSEY| PRIEST| NM|Southwest| M| Cashier| No| 52| 0| 59480| 3| 1| 1| 4| MARRIED| 0| 0| 25| 53,06| 0| 0| 1001| 0| 1| 0| 700| 1| 1001| 26070| HIGH| | CU197| ENRIQUE| BARRIOS| MI| Midwest| M| Nurse| No| 77| 0| 73461| 0| 1| 0| 2| SINGLE| 0| 0| 25| 53,06| 598| 0| 0| 0| 0| 0| 2500| 0| 0|27065,25| HIGH| | CU476| CHONG| CROWDER| CA| West| M|Administrative As...| No| 69| 1| 61943| 0| 1| 1| 2| MARRIED| 0| 0| 1466| 53,12| 
1971| 0| 909| 2| 1| 1| 600| 0| 909|28385,75| HIGH| | CU9110| JOHN| CLARK| DC|NorthEast| M|Administrative As...| No| 46| 1| 64648| 0| 1| 1| 2| DIVORCED| 0| 0| 25| 53,06| 0| 0| 2500| 0| 1| 1| 800| 1| 2500| 26762| HIGH| | CU7291| LAUREN| LAYNE| MI| Midwest| M| Nurse| No| 42| 1| 58327| 0| 1| 0| 4| SINGLE| 0| 5100| 1926| 53,45| 2570| 4| 0| 4| 0| 2| 800| 1| 1500|15781,75| MEDIUM| | CU7148| VINCENT| COBB| OK| Midwest| M|Programmer/Developer| No| 28| 0| 60968| 2| 1| 1| 2| MARRIED| 0| 750| 206| 53.00| 725| 3| 700| 2| 1| 3| 900| 1| 700| 25042| HIGH| +-----------+--------+---------+-----+---------+---+--------------------+-------------+---+------------+------+---------------+-------------+---------------+----------------+--------------+--------------+----------+---------------+----------------------+-----------------------+----------------------+---------------+-----------+-----------+--------------+------------------+-------------+----------------+--------+---------+ only showing top 20 rows
Target = BUY_INSURANCE, aplicar Árvore de decisão¶
Verificando tipos¶
In [ ]:
# Inspect the column names and the types Spark inferred from the CSV.
customer.printSchema()
root |-- CUSTOMER_ID: string (nullable = true) |-- LAST: string (nullable = true) |-- FIRST: string (nullable = true) |-- STATE: string (nullable = true) |-- REGION: string (nullable = true) |-- SEX: string (nullable = true) |-- PROFESSION: string (nullable = true) |-- BUY_INSURANCE: string (nullable = true) |-- AGE: integer (nullable = true) |-- HAS_CHILDREN: integer (nullable = true) |-- SALARY: integer (nullable = true) |-- N_OF_DEPENDENTS: integer (nullable = true) |-- CAR_OWNERSHIP: integer (nullable = true) |-- HOUSE_OWNERSHIP: integer (nullable = true) |-- TIME_AS_CUSTOMER: integer (nullable = true) |-- MARITAL_STATUS: string (nullable = true) |-- CREDIT_BALANCE: integer (nullable = true) |-- BANK_FUNDS: integer (nullable = true) |-- CHECKING_AMOUNT: integer (nullable = true) |-- MONEY_MONTLY_OVERDRAWN: string (nullable = true) |-- T_AMOUNT_AUTOM_PAYMENTS: integer (nullable = true) |-- MONTHLY_CHECKS_WRITTEN: integer (nullable = true) |-- MORTGAGE_AMOUNT: integer (nullable = true) |-- N_TRANS_ATM: integer (nullable = true) |-- N_MORTGAGES: integer (nullable = true) |-- N_TRANS_TELLER: integer (nullable = true) |-- CREDIT_CARD_LIMITS: integer (nullable = true) |-- N_TRANS_KIOSK: integer (nullable = true) |-- N_TRANS_WEB_BANK: integer (nullable = true) |-- LTV: string (nullable = true) |-- LTV_BIN: string (nullable = true)
Foram encontrados dados numéricos como String, fazendo a correção:¶
In [ ]:
# These columns use a decimal comma ("53,14"); swap it for a dot and cast
# the result to float so they become numeric.
for decimal_comma_col in ("MONEY_MONTLY_OVERDRAWN", "LTV"):
    customer = customer.withColumn(
        decimal_comma_col,
        regexp_replace(col(decimal_comma_col), ",", ".").cast("float"),
    )
In [ ]:
# Confirm the decimal-comma columns now display as floats.
customer.show()
+-----------+--------+---------+-----+---------+---+--------------------+-------------+---+------------+------+---------------+-------------+---------------+----------------+--------------+--------------+----------+---------------+----------------------+-----------------------+----------------------+---------------+-----------+-----------+--------------+------------------+-------------+----------------+--------+---------+ |CUSTOMER_ID| LAST| FIRST|STATE| REGION|SEX| PROFESSION|BUY_INSURANCE|AGE|HAS_CHILDREN|SALARY|N_OF_DEPENDENTS|CAR_OWNERSHIP|HOUSE_OWNERSHIP|TIME_AS_CUSTOMER|MARITAL_STATUS|CREDIT_BALANCE|BANK_FUNDS|CHECKING_AMOUNT|MONEY_MONTLY_OVERDRAWN|T_AMOUNT_AUTOM_PAYMENTS|MONTHLY_CHECKS_WRITTEN|MORTGAGE_AMOUNT|N_TRANS_ATM|N_MORTGAGES|N_TRANS_TELLER|CREDIT_CARD_LIMITS|N_TRANS_KIOSK|N_TRANS_WEB_BANK| LTV| LTV_BIN| +-----------+--------+---------+-----+---------+---+--------------------+-------------+---+------------+------+---------------+-------------+---------------+----------------+--------------+--------------+----------+---------------+----------------------+-----------------------+----------------------+---------------+-----------+-----------+--------------+------------------+-------------+----------------+--------+---------+ | CU8589| KALA| IVERSON| WI| Midwest| F| PROF-9| No| 49| 1| 68696| 1| 1| 1| 1| WIDOWED| 0| 16100| 25| 53.14| 1749| 4| 5500| 2| 1| 5| 800| 1| 3700| 25574.0| HIGH| | CU9823| NESTOR| HORTON| CA| West| M| Nurse| No| 24| 0| 73850| 0| 1| 0| 2| SINGLE| 0| 0| 25| 53.06| 504| 0| 0| 0| 0| 0| 1500| 1| 0| 21862.5| MEDIUM| | CU12175| DWIGHT| ROLLINS| MI| Midwest| M|Programmer/Developer| No| 26| 1| 60249| 2| 1| 1| 2| MARRIED| 452| 500| 134| 53.42| 625| 4| 1036| 4| 1| 2| 1000| 4| 1036|19662.25| MEDIUM| | CU7924| GRADY| STATON| NY|NorthEast| M|Programmer/Developer| No| 32| 0| 60466| 1| 1| 0| 1| SINGLE| 0| 650| 265| 53.18| 278| 17| 0| 3| 0| 2| 700| 3| 0| 16816.5| MEDIUM| | CU14284|DOMINICK| LAW| NY|NorthEast| M|Construction Laborer| No| 24| 0| 76570| 
3| 1| 1| 3| SINGLE| 0| 0| 25| 53.06| 0| 1| 358| 0| 1| 0| 1500| 1| 358| 27042.5| HIGH| | CU14620| DANIAL| CORLEY| UT|Southwest| M| Truck Driver| No| 35| 1| 62756| 1| 1| 1| 1| MARRIED| 0| 501| 1382| 53.32| 1636| 8| 1500| 3| 1| 2| 500| 1| 1500| 22689.0| HIGH| | CU15186| VERNON| NIETO| NY|NorthEast| F| IT Staff| No| 36| 1| 62886| 1| 1| 1| 1| MARRIED| 0| 600| 25| 53.22| 757| 2| 1020| 3| 1| 2| 1000| 1| 1020| 22821.5| HIGH| | CU5165| KITTY| COFFEY| MI| Midwest| F| Truck Driver| No| 26| 1| 61012| 1| 1| 1| 1| MARRIED| 0| 2400| 1314| 53.27| 2299| 3| 1300| 3| 1| 1| 1100| 0| 1300| 21353.0| MEDIUM| | CU5938| TEDDY| MURILLO| CA| West| M| Author| No| 78| 1| 65134| 0| 1| 0| 2| SINGLE| 0| 0| 25| 53.08| 514| 4| 0| 1| 0| 0| 900| 0| 0| 20083.5| MEDIUM| | CU3296| MARYLEE| HATHAWAY| CA| West| F| Not specified| No| 49| 1| 60322| 3| 1| 1| 4| WIDOWED| 30650| 9550| 266| 53.73| 33850| 12| 6500| 4| 1| 4| 900| 2| 1700| 20980.5| MEDIUM| | CU1069| THURMAN| VAUGHAN| MN| West| M| PROF-15| No| 80| 1| 63067| 0| 1| 0| 2| SINGLE| 0| 0| 25| 53.06| 504| 0| 0| 0| 0| 1| 1400| 0| 0|19766.75| MEDIUM| | CU13543| DARYL| ADDISON| WA| West| F| PROF-39| No| 52| 1| 73090| 5| 1| 2| 5| DIVORCED| 0| 1750| 25| 53.33| 249| 10| 2728| 3| 2| 3| 1500| 1| 2728| 30972.5|VERY HIGH| | CU11717| LAVETA| CASEY| NY|NorthEast| F| Publisher| No| 31| 0| 71782| 5| 1| 1| 5| DIVORCED| 0| 0| 34| 53.02| 0| 0| 2200| 2| 1| 1| 1500| 1| 2200| 23545.5| HIGH| | CU5928| ROBERTO|CORNELIUS| NY|NorthEast| M| Childcare Worker| No| 44| 1| 61056| 1| 1| 1| 1| DIVORCED| 0| 0| 461| 53.15| 965| 0| 2200| 2| 1| 1| 1000| 0| 2200| 23164.0| HIGH| | CU10012| DORSEY| PRIEST| NM|Southwest| M| Cashier| No| 52| 0| 59480| 3| 1| 1| 4| MARRIED| 0| 0| 25| 53.06| 0| 0| 1001| 0| 1| 0| 700| 1| 1001| 26070.0| HIGH| | CU197| ENRIQUE| BARRIOS| MI| Midwest| M| Nurse| No| 77| 0| 73461| 0| 1| 0| 2| SINGLE| 0| 0| 25| 53.06| 598| 0| 0| 0| 0| 0| 2500| 0| 0|27065.25| HIGH| | CU476| CHONG| CROWDER| CA| West| M|Administrative As...| No| 69| 1| 61943| 0| 1| 1| 2| MARRIED| 0| 0| 1466| 
53.12| 1971| 0| 909| 2| 1| 1| 600| 0| 909|28385.75| HIGH| | CU9110| JOHN| CLARK| DC|NorthEast| M|Administrative As...| No| 46| 1| 64648| 0| 1| 1| 2| DIVORCED| 0| 0| 25| 53.06| 0| 0| 2500| 0| 1| 1| 800| 1| 2500| 26762.0| HIGH| | CU7291| LAUREN| LAYNE| MI| Midwest| M| Nurse| No| 42| 1| 58327| 0| 1| 0| 4| SINGLE| 0| 5100| 1926| 53.45| 2570| 4| 0| 4| 0| 2| 800| 1| 1500|15781.75| MEDIUM| | CU7148| VINCENT| COBB| OK| Midwest| M|Programmer/Developer| No| 28| 0| 60968| 2| 1| 1| 2| MARRIED| 0| 750| 206| 53.0| 725| 3| 700| 2| 1| 3| 900| 1| 700| 25042.0| HIGH| +-----------+--------+---------+-----+---------+---+--------------------+-------------+---+------------+------+---------------+-------------+---------------+----------------+--------------+--------------+----------+---------------+----------------------+-----------------------+----------------------+---------------+-----------+-----------+--------------+------------------+-------------+----------------+--------+---------+ only showing top 20 rows
Fazendo um Describe dos dados¶
In [ ]:
# Summary statistics (count / mean / stddev / min / max) for every column.
customer.describe().show()
+-------+-----------+-----+-------+-----+-------+----+---------------+-------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+--------------+------------------+-----------------+------------------+----------------------+-----------------------+----------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+---------+ |summary|CUSTOMER_ID| LAST| FIRST|STATE| REGION| SEX| PROFESSION|BUY_INSURANCE| AGE| HAS_CHILDREN| SALARY| N_OF_DEPENDENTS| CAR_OWNERSHIP| HOUSE_OWNERSHIP| TIME_AS_CUSTOMER|MARITAL_STATUS| CREDIT_BALANCE| BANK_FUNDS| CHECKING_AMOUNT|MONEY_MONTLY_OVERDRAWN|T_AMOUNT_AUTOM_PAYMENTS|MONTHLY_CHECKS_WRITTEN| MORTGAGE_AMOUNT| N_TRANS_ATM| N_MORTGAGES| N_TRANS_TELLER|CREDIT_CARD_LIMITS| N_TRANS_KIOSK| N_TRANS_WEB_BANK| LTV| LTV_BIN| +-------+-----------+-----+-------+-----+-------+----+---------------+-------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+--------------+------------------+-----------------+------------------+----------------------+-----------------------+----------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+---------+ | count| 1015| 1015| 1015| 1015| 1015|1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| 1015| | mean| NULL| NULL| NULL| NULL| NULL|NULL| NULL| NULL|38.190147783251234|0.5113300492610837|65102.80295566502|1.993103448275862|0.9467980295566503|0.8049261083743843|2.4285714285714284| NULL|2234.1014778325125|2639.839408866995| 1055.848275862069| 53.708157882314595| 4980.337931034483| 4.311330049261084|2066.1221674876847| 
2.826600985221675|0.8049261083743843|1.7310344827586206| 1285.615763546798|1.8640394088669952|1449.6758620689654|22452.39039408867| NULL| | stddev| NULL| NULL| NULL| NULL| NULL|NULL| NULL| NULL|14.918394106597274|0.5001180377766468|6848.442846277457|1.549623522433759|0.2245466473592848|0.5077072350739917|1.2333992047170599| NULL|11727.873803601524|4996.100695402684|3124.4059848269044| 1.6850125888497685| 20443.6578124019| 4.817335938750649| 3184.931347333881|1.8914011204972785|0.5077072350739917|1.4993662085725994| 858.2066930270321|1.8260760353304921|2428.9208272335786|6579.356570077857| NULL| | min| CU100|AARON|ABRAHAM| AK|Midwest| F| Accountant| No| 0| 0| 37572| 0| 0| 0| 1| DIVORCED| 0| 0| 25| 32.16| 0| 0| 0| 0| 0| 0| 500| 0| 0| 0.0| HIGH| | max| CU9988| ZACK| ZUNIGA| WI| West| M|Waiter/Waitress| Yes| 84| 1| 109943| 6| 1| 2| 5| WIDOWED| 170498| 36000| 23476| 73.61| 499362| 18| 45000| 8| 2| 9| 5000| 10| 45000| 43101.25|VERY HIGH| +-------+-----------+-----+-------+-----+-------+----+---------------+-------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+--------------+------------------+-----------------+------------------+----------------------+-----------------------+----------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+---------+
In [ ]:
# Approximate distinct-value count per column, to spot identifier-like
# columns (high cardinality) vs. categorical ones (low cardinality).
# NOTE: the loop variable is named `column_name` instead of `col` so it no
# longer shadows pyspark.sql.functions.col (the original shadowing forced
# the re-import of `col` further down the notebook).
unique_value_counts = {}
for column_name in customer.columns:
    distinct_count = customer.select(approx_count_distinct(column_name)).collect()[0][0]
    unique_value_counts[column_name] = distinct_count
for column_name, distinct_count in unique_value_counts.items():
    print(f"Column '{column_name}' has {distinct_count} unique values.")
Column 'CUSTOMER_ID' has 1065 unique values. Column 'LAST' has 852 unique values. Column 'FIRST' has 857 unique values. Column 'STATE' has 22 unique values. Column 'REGION' has 5 unique values. Column 'SEX' has 2 unique values. Column 'PROFESSION' has 97 unique values. Column 'BUY_INSURANCE' has 2 unique values. Column 'AGE' has 65 unique values. Column 'HAS_CHILDREN' has 2 unique values. Column 'SALARY' has 1026 unique values. Column 'N_OF_DEPENDENTS' has 7 unique values. Column 'CAR_OWNERSHIP' has 2 unique values. Column 'HOUSE_OWNERSHIP' has 3 unique values. Column 'TIME_AS_CUSTOMER' has 5 unique values. Column 'MARITAL_STATUS' has 5 unique values. Column 'CREDIT_BALANCE' has 95 unique values. Column 'BANK_FUNDS' has 272 unique values. Column 'CHECKING_AMOUNT' has 347 unique values. Column 'MONEY_MONTLY_OVERDRAWN' has 279 unique values. Column 'T_AMOUNT_AUTOM_PAYMENTS' has 679 unique values. Column 'MONTHLY_CHECKS_WRITTEN' has 19 unique values. Column 'MORTGAGE_AMOUNT' has 244 unique values. Column 'N_TRANS_ATM' has 9 unique values. Column 'N_MORTGAGES' has 3 unique values. Column 'N_TRANS_TELLER' has 9 unique values. Column 'CREDIT_CARD_LIMITS' has 24 unique values. Column 'N_TRANS_KIOSK' has 11 unique values. Column 'N_TRANS_WEB_BANK' has 228 unique values. Column 'LTV' has 1004 unique values. Column 'LTV_BIN' has 4 unique values.
Removendo dados não relevantes¶
In [ ]:
# Identifier and free-text columns carry no predictive signal; drop them.
columns_to_drop = [
    "CUSTOMER_ID",
    "LAST",
    "FIRST",
    "PROFESSION",
    "STATE",
]
customer = customer.drop(*columns_to_drop)
Verificando dados nulos¶
In [ ]:
from pyspark.sql.functions import col, count, when

# Count the nulls of every column in a SINGLE Spark job: the original ran a
# separate full-table filter().count() action per column (one job each).
# `count(when(cond, c))` counts only rows where the condition holds.
null_counts_row = customer.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in customer.columns]
).collect()[0]
null_counts = [(c, null_counts_row[c]) for c in customer.columns]
for col_name, null_count in null_counts:
    print(f"{col_name}: {null_count}")
REGION: 0 SEX: 0 BUY_INSURANCE: 0 AGE: 0 HAS_CHILDREN: 0 SALARY: 0 N_OF_DEPENDENTS: 0 CAR_OWNERSHIP: 0 HOUSE_OWNERSHIP: 0 TIME_AS_CUSTOMER: 0 MARITAL_STATUS: 0 CREDIT_BALANCE: 0 BANK_FUNDS: 0 CHECKING_AMOUNT: 0 MONEY_MONTLY_OVERDRAWN: 0 T_AMOUNT_AUTOM_PAYMENTS: 0 MONTHLY_CHECKS_WRITTEN: 0 MORTGAGE_AMOUNT: 0 N_TRANS_ATM: 0 N_MORTGAGES: 0 N_TRANS_TELLER: 0 CREDIT_CARD_LIMITS: 0 N_TRANS_KIOSK: 0 N_TRANS_WEB_BANK: 0 LTV: 0 LTV_BIN: 0
Sem dados Nulos!¶
In [ ]:
# Index categorical columns, assemble the feature vector, and binarize the
# target column into a 0/1 label.
var_categoricas = ['REGION', 'SEX', 'MARITAL_STATUS', 'LTV_BIN']

# Unfitted indexers: Pipeline.fit() fits each one exactly once. The original
# called .fit(customer) on every indexer AND fitted the pipeline again,
# doing the indexing work twice.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in var_categoricas
]
pipeline = Pipeline(stages=indexers)
customer = pipeline.fit(customer).transform(customer)

# The raw string columns are replaced by their *_index counterparts.
customer = customer.drop(*var_categoricas)

# Every remaining column except the target is a model feature.
feature_cols = customer.columns.copy()
feature_cols.remove("BUY_INSURANCE")

vector_assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
)
customer = vector_assembler.transform(customer)

# Keep only the assembled features plus the target, renamed to "label",
# then map "No" -> 0 and anything else ("Yes") -> 1.
customer = customer.select(
    col("features").alias("features"),
    col("BUY_INSURANCE").alias("label"),
)
customer = customer.withColumn("label", when(customer["label"] == "No", 0).otherwise(1))
customer.show()
+--------------------+-----+ | features|label| +--------------------+-----+ |[49.0,1.0,68696.0...| 0| |(25,[0,2,4,6,9,10...| 0| |[26.0,1.0,60249.0...| 0| |[32.0,0.0,60466.0...| 0| |(25,[0,2,3,4,5,6,...| 0| |[35.0,1.0,62756.0...| 0| |[36.0,1.0,62886.0...| 0| |[26.0,1.0,61012.0...| 0| |(25,[0,1,2,4,6,9,...| 0| |[49.0,1.0,60322.0...| 0| |(25,[0,1,2,4,6,9,...| 0| |[52.0,1.0,73090.0...| 0| |[31.0,0.0,71782.0...| 0| |[44.0,1.0,61056.0...| 0| |[52.0,0.0,59480.0...| 0| |(25,[0,2,4,6,9,10...| 0| |[69.0,1.0,61943.0...| 0| |[46.0,1.0,64648.0...| 0| |[42.0,1.0,58327.0...| 0| |[28.0,0.0,60968.0...| 0| +--------------------+-----+ only showing top 20 rows
In [ ]:
# 70/30 train-test split; fixed seed so the split is reproducible.
train_data, test_data = customer.randomSplit([0.7, 0.3], seed=420)

dt = DecisionTreeClassifier(featuresCol="features")
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")

# 3 depths x 4 bin counts = 12 candidate parameter combinations.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(dt.maxDepth, [5, 10, 15])
grid_builder.addGrid(dt.maxBins, [20, 40, 60, 80])
paramGrid = grid_builder.build()

# 5-fold cross-validation over the grid, selecting by ROC AUC.
crossval = CrossValidator(
    estimator=dt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)
cvModel = crossval.fit(train_data)
bestModel = cvModel.bestModel

# Score the held-out test set with the winning tree.
predictions = bestModel.transform(test_data)
roc_auc = evaluator.evaluate(predictions)
print(f"ROC-AUC Score: {roc_auc}")
ROC-AUC Score: 0.8796711509715994
In [ ]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator per metric; all of them score the same test predictions.
evaluator_accuracy = MulticlassClassificationEvaluator(metricName="accuracy")
evaluator_precision = MulticlassClassificationEvaluator(metricName="weightedPrecision")
evaluator_recall = MulticlassClassificationEvaluator(metricName="weightedRecall")
evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1")

accuracy, precision, recall, f1_score = (
    metric_evaluator.evaluate(predictions)
    for metric_evaluator in (
        evaluator_accuracy,
        evaluator_precision,
        evaluator_recall,
        evaluator_f1,
    )
)

for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1_score),
):
    print(f"{metric_label}: {metric_value}")
Accuracy: 0.8498402555910544 Precision: 0.8621285880890186 Recall: 0.8498402555910544 F1 Score: 0.8533440362635478
In [ ]:
# Make the tree rules human-readable: translate the 0/1 predictions back to
# the target labels and the "feature N" placeholders to real column names.
regras = bestModel.toDebugString \
    .replace("Predict: 1.0", "BUY_INSURANCE: Yes") \
    .replace("Predict: 0.0", "BUY_INSURANCE: No")

used_features = []
for f, feature in enumerate(feature_cols):
    placeholder = f"feature {f} "
    # Test the placeholder BEFORE substituting it: the original checked
    # `feature in regras` after replacement, which falsely marked a feature
    # as used whenever its name is a substring of another feature's name
    # (e.g. "LTV" inside "LTV_BIN_index").
    if placeholder in regras:
        used_features.append(feature)
    regras = regras.replace(placeholder, f"{feature} ")
In [ ]:
def parse_debug_string_lines(lines):
    """Recursively consume tree-debug lines into a nested dict structure.

    Mutates `lines` in place (pop(0)); each node is {'name': ...} with an
    optional 'children' list. An If line is expected to be followed, after
    its subtree, by the matching Else line.
    """
    nodes = []
    while lines:
        head = lines[0]
        if head.startswith('If'):
            # Strip the "If" keyword and the surrounding parentheses.
            condition = ' '.join(lines.pop(0).split()[1:]).replace('(', '').replace(')', '')
            nodes.append({'name': condition, 'children': parse_debug_string_lines(lines)})
            # The matching Else branch follows the If subtree.
            if lines[0].startswith('Else'):
                condition = ' '.join(lines.pop(0).split()[1:]).replace('(', '').replace(')', '')
                nodes.append({'name': condition, 'children': parse_debug_string_lines(lines)})
        elif not head.startswith(('If', 'Else')):
            # Leaf line (e.g. a prediction): keep it verbatim.
            nodes.append({'name': lines.pop(0)})
        else:
            # An Else belonging to the caller ends this subtree.
            break
    return nodes
def debug_str_to_json(debug_string):
    """Convert a decision-tree debug string into a nested dict tree.

    Collects the tree's lines (stopping at the first blank line), strips
    their indentation, skips the model-header line, and delegates the
    If/Else structure to parse_debug_string_lines.

    Fixes vs. the original: the dead `if not line: break` (unreachable,
    since `line` is always truthy inside the loop) is removed, and the
    result is no longer stored in a local variable named `json`, which
    shadowed the `json` module imported later in the notebook.
    """
    data = []
    for line in debug_string.splitlines():
        # A blank line marks the end of the tree text.
        if not line.strip():
            break
        data.append(line.strip())
    # data[0] is the header line ("DecisionTreeClassificationModel ...").
    return {'name': 'Root', 'children': parse_debug_string_lines(data[1:])}
import json
# Nested-dict representation of the best tree (with readable feature names
# substituted into `regras`); uncomment the print to pretty-print it as JSON.
dict_tree_json = debug_str_to_json(regras)
# print(json.dumps(dict_tree_json,indent = 1 ))
In [ ]:
from IPython.display import SVG, display

def show_svg(filename='/home/glauco/staging/jsoncrack.com.svg'):
    """Render an SVG file inline in the notebook.

    Args:
        filename: Path to the SVG file to display. Defaults to the
            previously hard-coded path, so existing calls are unchanged;
            the parameter lets the same helper show any other SVG.
    """
    display(SVG(filename=filename))

show_svg()
In [ ]:
#################################################
# Plotando a curva ROC
#################################################
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
import matplotlib.pyplot as plt

def extract_probability(probability_vector):
    """Return the positive-class (index 1) probability as a plain float."""
    return float(probability_vector[1])

# Wrap the extractor as a Spark UDF returning FloatType.
extract_prob_udf = udf(extract_probability, FloatType())

# Add a scalar column holding the probability of the positive class.
predictions = predictions.withColumn("positive_probability", extract_prob_udf("probability"))
# Now compute the ROC curve.
# Collect the (probability, label) pairs to the driver ONCE: the original
# ran 4 filter().count() Spark actions per threshold — 400 jobs over the
# same small DataFrame. The TPR/FPR values are identical.
prob_label_pairs = [
    (row["positive_probability"], row["label"])
    for row in predictions.select("positive_probability", "label").collect()
]
# TP + FN is always the total number of real positives; FP + TN the negatives.
total_positives = sum(1 for _, label in prob_label_pairs if label == 1)
total_negatives = len(prob_label_pairs) - total_positives

thresholds = [i / 100.0 for i in range(100)]  # thresholds 0.00 .. 0.99
roc_points = []
for threshold in thresholds:
    # Predicted positive when the positive-class probability >= threshold.
    TP = sum(1 for p, label in prob_label_pairs if p >= threshold and label == 1)
    FP = sum(1 for p, label in prob_label_pairs if p >= threshold and label == 0)
    TPR = TP / total_positives if total_positives else 0
    FPR = FP / total_negatives if total_negatives else 0
    roc_points.append((FPR, TPR))

# Extract FPR and TPR into separate lists for plotting.
fpr, tpr = zip(*roc_points)
# Plot the ROC curve, with the diagonal chance line for reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()